import pandas as pd
import os
import plotly.graph_objs as go
import random
import math
import matplotlib.pyplot as plt
# Load the two metadata tables that the rest of the analysis joins together.
input_path = '../artworks'
artists_csv = os.path.join(input_path, 'artists.csv')
wikiart_csv = os.path.join(input_path, 'all_data_info.csv')
df = pd.read_csv(artists_csv)   # data from kaggle
df1 = pd.read_csv(wikiart_csv)  # data from wikiart
df.head()
| id | name | years | genre | nationality | bio | wikipedia | paintings | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Amedeo Modigliani | 1884 - 1920 | Expressionism | Italian | Amedeo Clemente Modigliani (Italian pronunciat... | http://en.wikipedia.org/wiki/Amedeo_Modigliani | 193 |
| 1 | 1 | Vasiliy Kandinskiy | 1866 - 1944 | Expressionism,Abstractionism | Russian | Wassily Wassilyevich Kandinsky (Russian: Васи́... | http://en.wikipedia.org/wiki/Wassily_Kandinsky | 88 |
| 2 | 2 | Diego Rivera | 1886 - 1957 | Social Realism,Muralism | Mexican | Diego María de la Concepción Juan Nepomuceno E... | http://en.wikipedia.org/wiki/Diego_Rivera | 70 |
| 3 | 3 | Claude Monet | 1840 - 1926 | Impressionism | French | Oscar-Claude Monet (; French: [klod mɔnɛ]; 14 ... | http://en.wikipedia.org/wiki/Claude_Monet | 73 |
| 4 | 4 | Rene Magritte | 1898 - 1967 | Surrealism,Impressionism | Belgian | René François Ghislain Magritte (French: [ʁəne... | http://en.wikipedia.org/wiki/René_Magritte | 194 |
# Print each column's dtype and non-null count to check the artists table for missing values.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 50 entries, 0 to 49 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 50 non-null int64 1 name 50 non-null object 2 years 50 non-null object 3 genre 50 non-null object 4 nationality 50 non-null object 5 bio 50 non-null object 6 wikipedia 50 non-null object 7 paintings 50 non-null int64 dtypes: int64(2), object(6) memory usage: 3.2+ KB
The output above shows that there are no missing values in the dataset.
# append paths of images to the dataframe
image_path = os.path.join(input_path, 'images/images')
artist_list = sorted(os.listdir(image_path))
# NOTE(review): presumably the image folder holds the Dürer directory twice
# under differently mis-encoded names; this drops one of them. Confirm
# against the actual directory listing.
artist_list.remove('Albrecht_DuÔòá├¬rer')  # remove one of the same names

# Collect one record per artist folder and build the frame once at the end.
# Calling pd.concat inside the loop re-copies the whole frame on every pass.
image_rows = []
n_images = 0
for artist in artist_list:
    artist_images = os.listdir(os.path.join(image_path, artist))
    n_images += len(artist_images)
    image_rows.append({
        # Folder names use underscores where artists.csv uses spaces.
        'name': artist.replace('_', ' '),
        # The whole sorted file list is stored in a single cell.
        'image_path': sorted(artist_images),
    })

# The default RangeIndex already numbers the rows 0..n-1, so there is no need
# to hard-code the number of artist folders.
df_images = pd.DataFrame(image_rows, columns=['name', 'image_path'])

# Rewrite the remaining mis-encoded folder name to the spelling used as the
# merge key below.
df_images.loc[df_images.name == 'Albrecht Du╠êrer', 'name'] = 'Albrecht Dürer'

# Inner join: artists whose name has no matching image folder are dropped.
df = df.merge(df_images, on='name')
df.head()
| id | name | years | genre | nationality | bio | wikipedia | paintings | image_path | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Amedeo Modigliani | 1884 - 1920 | Expressionism | Italian | Amedeo Clemente Modigliani (Italian pronunciat... | http://en.wikipedia.org/wiki/Amedeo_Modigliani | 193 | [Amedeo_Modigliani_1.jpg, Amedeo_Modigliani_10... |
| 1 | 1 | Vasiliy Kandinskiy | 1866 - 1944 | Expressionism,Abstractionism | Russian | Wassily Wassilyevich Kandinsky (Russian: Васи́... | http://en.wikipedia.org/wiki/Wassily_Kandinsky | 88 | [Vasiliy_Kandinskiy_1.jpg, Vasiliy_Kandinskiy_... |
| 2 | 2 | Diego Rivera | 1886 - 1957 | Social Realism,Muralism | Mexican | Diego María de la Concepción Juan Nepomuceno E... | http://en.wikipedia.org/wiki/Diego_Rivera | 70 | [Diego_Rivera_1.jpg, Diego_Rivera_10.jpg, Dieg... |
| 3 | 3 | Claude Monet | 1840 - 1926 | Impressionism | French | Oscar-Claude Monet (; French: [klod mɔnɛ]; 14 ... | http://en.wikipedia.org/wiki/Claude_Monet | 73 | [Claude_Monet_1.jpg, Claude_Monet_10.jpg, Clau... |
| 4 | 4 | Rene Magritte | 1898 - 1967 | Surrealism,Impressionism | Belgian | René François Ghislain Magritte (French: [ʁəne... | http://en.wikipedia.org/wiki/René_Magritte | 194 | [Rene_Magritte_1.jpg, Rene_Magritte_10.jpg, Re... |
# merge two datasets
# Join every artist row with that artist's wikiart paintings. The wikiart
# genre (genre_y) replaces the artist-level one (genre_x), and the columns
# made redundant by the join are dropped.
df = (
    df.merge(df1, left_on='name', right_on='artist')
    .drop(['paintings', 'genre_x', 'artist'], axis=1)
    .rename(columns={'genre_y': 'genre'})
)

# get the top 30 artists based on the number of paintings
# value_counts() sorts by descending count, so the first 30 labels are the
# artists with the most rows (paintings) in the merged table.
top_30_artists = df.name.value_counts().head(30).index

# isin() is the vectorised form of the row-by-row membership test. .copy()
# makes the result an independent frame so that later column assignments do
# not operate on a filtered slice.
df = df[df.name.isin(top_30_artists)].copy()
df.head()
| id | name | years | nationality | bio | wikipedia | date | genre | pixelsx | pixelsy | size_bytes | source | style | title | artist_group | in_train | new_filename | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Amedeo Modigliani | 1884 - 1920 | Italian | Amedeo Clemente Modigliani (Italian pronunciat... | http://en.wikipedia.org/wiki/Amedeo_Modigliani | 1916 | nude painting (nu) | 3200.0 | 2200.0 | 494416.0 | wikiart | Expressionism | Nude on sofa (Almaisa) | train_and_test | False | 34153.jpg |
| 1 | 0 | Amedeo Modigliani | 1884 - 1920 | Italian | Amedeo Clemente Modigliani (Italian pronunciat... | http://en.wikipedia.org/wiki/Amedeo_Modigliani | 1917 | nude painting (nu) | 3200.0 | 2065.0 | 512481.0 | wikiart | Expressionism | Reclining nude with folded arms behind her head | train_and_test | False | 28610.jpg |
| 2 | 0 | Amedeo Modigliani | 1884 - 1920 | Italian | Amedeo Clemente Modigliani (Italian pronunciat... | http://en.wikipedia.org/wiki/Amedeo_Modigliani | 1916 | portrait | 2024.0 | 2908.0 | 516386.0 | wikiart | Expressionism | Portrait of Monsieur Lepoutre | train_and_test | True | 81931.jpg |
| 3 | 0 | Amedeo Modigliani | 1884 - 1920 | Italian | Amedeo Clemente Modigliani (Italian pronunciat... | http://en.wikipedia.org/wiki/Amedeo_Modigliani | 1917 | portrait | 2024.0 | 2852.0 | 439954.0 | wikiart | Expressionism | Portrait of the Mechanical | train_and_test | False | 50579.jpg |
| 4 | 0 | Amedeo Modigliani | 1884 - 1920 | Italian | Amedeo Clemente Modigliani (Italian pronunciat... | http://en.wikipedia.org/wiki/Amedeo_Modigliani | 1914 | portrait | 2024.0 | 2836.0 | 472230.0 | wikiart | Expressionism | Portrait of Diego Rivera | train_and_test | True | 27829.jpg |
# Print each column's dtype and non-null count to find the columns of the merged table with missing values.
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 10779 entries, 0 to 12441 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 10779 non-null int64 1 name 10779 non-null object 2 years 10779 non-null object 3 nationality 10779 non-null object 4 bio 10779 non-null object 5 wikipedia 10779 non-null object 6 date 9738 non-null object 7 genre 10753 non-null object 8 pixelsx 10779 non-null float64 9 pixelsy 10779 non-null float64 10 size_bytes 10779 non-null float64 11 source 10779 non-null object 12 style 10775 non-null object 13 title 10779 non-null object 14 artist_group 10779 non-null object 15 in_train 10779 non-null bool 16 new_filename 10779 non-null object dtypes: bool(1), float64(3), int64(1), object(12) memory usage: 1.4+ MB
The output above shows that some values are missing for style, genre, and date. We drop the 4 paintings that have no style information and keep the rest.
# Drop the paintings that have no style label. The result is re-bound rather
# than using inplace=True: df is itself the result of a boolean filter, and
# mutating such a frame in place can trigger pandas' SettingWithCopyWarning.
df = df.dropna(subset=['style'])
# date data can be in different formats and can be missing
def process_date(x):
    """Convert one raw ``date`` cell into an integer year.

    Handled inputs:
      * numbers (``1916``, ``1916.0``), truncated to ``int``;
      * numeric strings (``'1916'``, ``'1916.0'``);
      * approximate dates with a leading ``c`` (``'c.1900'``, ``'c1900'``,
        ``'c. 1900'``).

    Args:
        x: The raw cell value (float, int, str or None).

    Returns:
        The year as an ``int``, or ``0`` when the value is missing (None, NaN,
        empty string) or cannot be parsed as a number. ``0`` is the
        placeholder this notebook uses for "no date".
    """
    if x is None:
        return 0
    if isinstance(x, str):
        text = x.strip()
        if text.startswith('c'):
            # Drop the marker and any '.' or space separator after it, instead
            # of assuming the prefix is exactly two characters long.
            text = text[1:].lstrip('. ')
        x = text
    try:
        year = float(x)
    except (TypeError, ValueError):
        # Empty or non-numeric text, or an unsupported type.
        return 0
    if math.isnan(year) or math.isinf(year):
        return 0
    return int(year)
# Normalise every raw date to an integer year (process_date returns 0 for a
# missing date), then re-inspect dtypes and non-null counts.
df['date'] = df['date'].apply(process_date)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 10775 entries, 0 to 12441 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 10775 non-null int64 1 name 10775 non-null object 2 years 10775 non-null object 3 nationality 10775 non-null object 4 bio 10775 non-null object 5 wikipedia 10775 non-null object 6 date 10775 non-null int64 7 genre 10750 non-null object 8 pixelsx 10775 non-null float64 9 pixelsy 10775 non-null float64 10 size_bytes 10775 non-null float64 11 source 10775 non-null object 12 style 10775 non-null object 13 title 10775 non-null object 14 artist_group 10775 non-null object 15 in_train 10775 non-null bool 16 new_filename 10775 non-null object dtypes: bool(1), float64(3), int64(2), object(11) memory usage: 1.4+ MB
After preprocessing, there are no missing values for date or style. Missing dates are filled with 0; genre still has 25 missing values, which we keep.
# Report the size of the cleaned dataset: distinct artists and total rows
# (one row per painting).
n_artists = len(pd.unique(df['name']))
n_images = df.shape[0]
print('There are totally {} artists in the dataset'.format(n_artists))
print('There are totally {} paintings in the dataset'.format(n_images))
There are totally 30 artists in the dataset There are totally 10775 paintings in the dataset
# a function to randomly choose colors
def random_colors(number):
    """Return ``number`` random colours as ``'#RRGGBB'`` hex strings.

    Each colour is built from six hex digits drawn independently with
    ``random.choice``, so results depend on the state of the global
    ``random`` generator.
    """
    hex_digits = '0123456789ABCDEF'
    palette = []
    for _ in range(number):
        code = ''.join(random.choice(hex_digits) for _ in range(6))
        palette.append('#' + code)
    return palette
# Bar chart of the number of paintings (rows) per artist; value_counts()
# orders the bars from most to fewest paintings.
paintings = df['name'].value_counts()

plt.figure(figsize=(20, 10))
plt.bar(
    x=paintings.index,
    height=paintings.values,
    color=random_colors(50),
)
plt.title('Paintings per Artist', fontsize=20)
plt.xlabel('Artist', fontsize=20)
plt.ylabel('Number of Paintings', fontsize=20)
# Rotate the artist names so the labels do not overlap.
plt.xticks(rotation=90, fontsize=15)
plt.grid(axis='y')
# Bar chart of the number of artists per nationality. Duplicate
# (name, nationality) pairs are removed first so each artist is counted once
# rather than once per painting.
artist_countries = df[['name', 'nationality']].drop_duplicates()
nationality = artist_countries['nationality'].value_counts()

plt.figure(figsize=(20, 10))
plt.bar(
    x=nationality.index,
    height=nationality.values,
    color=random_colors(50),
)
plt.title('Painters per Country', fontsize=20)
plt.xlabel('Country', fontsize=20)
plt.ylabel('Number of Painters', fontsize=20)
plt.xticks(rotation=90, fontsize=15)
plt.grid(axis='y')
# Bar chart of the number of artists per style. Each distinct (artist, style)
# pair is counted once, so an artist with several styles contributes to
# several bars.
plt.figure(figsize=(20, 10))
artist_styles = df[['name', 'style']].drop_duplicates()
style_counts = artist_styles['style'].value_counts()
style_counts.plot.bar(color=random_colors(50))  # draws on the figure created above
print('There are totally {} styles.'.format(len(df['style'].unique())))
plt.title('Painters per style', fontsize=20)
plt.xlabel('Style', fontsize=20)
plt.ylabel('Number of painters', fontsize=20)
plt.xticks(rotation=90, fontsize=15)
plt.grid(axis='y')
There are totally 33 styles.
# Bar chart of the number of paintings per style, on a logarithmic y axis so
# that styles with few paintings stay visible next to the large ones.
plt.figure(figsize=(20, 10))
paintings_per_style = df['style'].value_counts()
paintings_per_style.plot.bar(color=random_colors(50))
plt.title('Paintings per style', fontsize=20)
plt.xlabel('Style', fontsize=20)
plt.ylabel('Number of painting', fontsize=20)
plt.xticks(rotation=90, fontsize=15)
plt.grid(axis='y')
plt.yscale('log')
import plotly.express as px

# Histogram of painting dates, coloured by style. A date of 0 is the
# placeholder for a missing date, so those rows are left out.
dated_paintings = df[df['date'] != 0]
fig = px.histogram(
    dated_paintings,
    x="date",
    color="style",
    marginal="rug",
    title='Histogram Per Style Based on Date',
)
fig.show()
The histogram above shows that painting styles evolve throughout history.
# display one painting of each artist
artist_names = df['name'].unique()

# Size the grid from the number of artists instead of hard-coding 5 x 6:
# plt.subplot raises once the index exceeds rows * cols. With the 30 artists
# kept above this still gives 5 rows of 6.
n_cols = 6
n_rows = max(1, math.ceil(len(artist_names) / n_cols))

# NOTE(review): the original read from 'artworks/resized/resized', which
# ignores input_path ('../artworks') used for every other file in this
# notebook. The path is now built from input_path; confirm the resized images
# live there.
resized_dir = os.path.join(input_path, 'resized', 'resized')

plt.figure(figsize=(50, 50))
for i, name in enumerate(artist_names):
    # Every row of an artist carries the same file list, so the first row is
    # enough.
    images = df[df['name'] == name].iloc[0]['image_path']
    # A separate name keeps the module-level image_path (the image directory)
    # from being overwritten by a single file path.
    painting_file = os.path.join(resized_dir, str(random.choice(images)))
    image = plt.imread(painting_file)
    plt.subplot(n_rows, n_cols, i + 1)
    plt.axis('off')
    plt.title('Author: {}'.format(name), fontsize=30)
    plt.imshow(image)
plt.show()